Análisis de https://www.nature.com/articles/srep00196.pdf

Podemos usar read_lines_chunked si el archivo original es grande:

library(tidyverse)
limpiar <- function(lineas,...){
  str_split(lineas, ',') %>% 
    keep(~.x[1] == 'EastAsian') %>%
    map(~.x[-1]) %>% # quitar tipo de cocina
    map(~.x[nchar(.x) > 0]) # quitar elementos vac{ios}
}
filtrado <- read_lines_chunked('../../datos/recetas/srep00196-s3.csv',
                    skip = 1, callback = ListCallback$new(limpiar))
recetas <-  filtrado %>% flatten
library(arules)
length(recetas)
[1] 2512
## No hacer mucho más chico que este soporte, pues tenemos relativamente
## pocas transacciones:
pars <- list(support = 0.05,  target = 'frequent itemsets',
             ext = TRUE)
ap_recetas <- apriori(recetas, parameter = pars)
Apriori

Parameter specification:

Algorithmic control:

Absolute minimum support count: 125 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[242 item(s), 2512 transaction(s)] done [0.00s].
sorting and recoding items ... [41 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 6 done [0.00s].
sorting transactions ... done [0.00s].
writing ... [628 set(s)] done [0.00s].
creating S4 object  ... done [0.00s].
length(ap_recetas)
[1] 628

Vemos los items frecuentes

frecs <- ap_recetas %>% subset(size(.) == 1 ) %>% sort(by = 'support') %>%
 DATAFRAME
DT::datatable(frecs %>% mutate_if(is.numeric, function(x) round(x, 3)))
Registered S3 methods overwritten by 'htmltools':
  method               from         
  print.html           tools:rstudio
  print.shiny.tag      tools:rstudio
  print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
  method           from         
  print.htmlwidget tools:rstudio

Y ahora examinamos combinaciones frecuentes de distintos tamaños

ap_recetas %>% 
  subset(size(.) == 2) %>%
  subset(support > 0.20) %>%
  sort(by = 'support') %>%
  inspect

Incluso hay algunas combinaciones de 4 ingredientes que ocurren con frecuencia alta: estos ingredientes son bases de salsas, combinaciones de condimentos:

ap_recetas %>% 
  subset(size(.) == 4) %>%
  subset(support > 0.10) %>%
  sort(by = 'support') %>%
  inspect
pars <- list(support = 0.01, confidence = 0.10,
             target = 'rules',
             ext = TRUE)
reglas_recetas <- apriori(recetas, parameter = pars)
Apriori

Parameter specification:

Algorithmic control:

Absolute minimum support count: 25 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[242 item(s), 2512 transaction(s)] done [0.00s].
sorting and recoding items ... [88 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 6 7 8 done [0.01s].
writing ... [50181 rule(s)] done [0.01s].
creating S4 object  ... done [0.01s].
agregar_hyperlift <- function(reglas, trans){
  quality(reglas) <- cbind(quality(reglas), 
    hyper_lift = interestMeasure(reglas, measure = "hyperLift", 
    transactions = trans))
  reglas
}
reglas_recetas <- agregar_hyperlift(reglas_recetas, recetas)

Análisis de pares comunes

library(arulesViz)
Loading required package: grid
Registered S3 method overwritten by 'seriation':
  method         from 
  reorder.hclust gclus
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     
reglas_1 <- subset(reglas_recetas, hyper_lift > 1.1 & support > 0.1 & confidence > 0.40)
length(reglas_1)
[1] 213
reglas_tam_2 <- subset(reglas_1, size(reglas_1)==2)
#inspect(reglas_tam_2 %>% sort(by = 'hyper_lift')) 
plot(reglas_1 %>% subset(support > 0.2), engine = "plotly")
To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
library(tidygraph)

Attaching package: ‘tidygraph’

The following object is masked from ‘package:stats’:

    filter
library(ggraph)
frecs <- 
df_reglas <- reglas_tam_2 %>% DATAFRAME %>% rename(from=LHS, to=RHS) %>% data.frame
df_reglas$weight <- log(df_reglas$lift)
graph_1 <- as_tbl_graph(df_reglas) %>%
  mutate(centrality = centrality_degree(mode = "all")) 
set.seed(881)
ggraph(graph_1, layout = 'fr') +
  geom_edge_link(aes(alpha=lift), 
                 colour = 'red',
                 arrow = arrow(length = unit(4, 'mm'))) + 
  geom_node_point(aes(size = centrality, colour = centrality)) + 
  geom_node_text(aes(label = name), size=4,
                 colour = 'gray20', repel=TRUE) +
  theme_graph()

reglas_1 <- subset(reglas_recetas, hyper_lift > 1.5 & confidence > 0.1)
length(reglas_1)
[1] 11190
reglas_tam_2 <- subset(reglas_1, size(reglas_1)==2)
length(reglas_tam_2)
[1] 134
library(tidygraph)
library(ggraph)
df_reglas <- reglas_tam_2 %>% DATAFRAME %>% rename(from=LHS, to=RHS) %>% as_data_frame
`as_data_frame()` is deprecated, use `as_tibble()` (but mind the new semantics).
This warning is displayed once per session.
df_reglas$weight <- log(df_reglas$hyper_lift)
graph_1 <- as_tbl_graph(df_reglas) %>%
  mutate(centrality = centrality_degree(mode = "all")) 

ggraph(graph_1, layout = 'fr', start.temp=100) +
  geom_edge_link(aes(alpha=lift), 
                 colour = 'red',
                 arrow = arrow(length = unit(4, 'mm'))) + 
  geom_node_point(aes(size = centrality, colour = centrality)) + 
  geom_node_text(aes(label = name), size=4,
                 colour = 'gray20', repel=TRUE) +
  theme_graph()

Exportamos para examinar en Gephi:

write_csv(df_reglas %>% rename(source=from, target=to) %>%
            select(-count), 
          path='reglas.csv')

Nota

La combinación corn y starch puede deberse en parte a una separación incorrecta en el procesamiento de los datos (corn starch o maizena convertido en dos ingredientes, corn y starch):

df_reglas %>% filter(from == "{corn}", to == "{starch}")

La confianza es considerablemente alta, aunque tenemos pocos datos de esta combinación. Podemos examinar algunos ejemplos:

recetas %>% keep(~ "tomato" %in% .x & "corn" %in% .x) %>% head(10)
[[1]]
 [1] "tomato"        "vinegar"       "pork"          "celery_oil"    "leek"          "corn"          "black_pepper" 
 [8] "pepper"        "ginger"        "pea"           "garlic"        "soybean"       "soy_sauce"     "chicken_broth"
[15] "wine"         

[[2]]
 [1] "tomato"     "vinegar"    "pepper"     "celery_oil" "corn"       "cayenne"    "pork"       "garlic"    
 [9] "soybean"    "vegetable"  "coriander"  "rice"       "soy_sauce" 

[[3]]
[1] "tomato"     "vinegar"    "pork"       "celery_oil" "soy_sauce"  "ginger"     "garlic"     "sherry"    
[9] "corn"      

[[4]]
 [1] "pepper"        "celery_oil"    "starch"        "corn"          "ginger"        "garlic"        "soybean"      
 [8] "tomato"        "vinegar"       "beef"          "soy_sauce"     "cayenne"       "scallion"      "bell_pepper"  
[15] "vegetable_oil" "rice"          "wine"         

[[5]]
 [1] "tomato"     "vinegar"    "pork"       "celery_oil" "beef"       "soy_sauce"  "ginger"     "garlic"    
 [9] "corn"       "wine"      

[[6]]
 [1] "tomato"      "vinegar"     "pepper"      "lemon_juice" "celery_oil"  "sake"        "corn"        "pork"       
 [9] "ginger"      "honey"       "garlic"      "soybean"     "rice"        "soy_sauce"  

[[7]]
[1] "tomato"  "garlic"  "onion"   "bacon"   "corn"    "cayenne" "egg"    

[[8]]
 [1] "pork"              "green_bell_pepper" "celery_oil"        "starch"            "corn"             
 [6] "garlic"            "tomato"            "vinegar"           "onion"             "soy_sauce"        
[11] "cider"             "scallion"          "celery"            "pineapple"         "vegetable_oil"    
[16] "egg"              

[[9]]
 [1] "tomato"       "vinegar"      "pepper"       "celery_oil"   "roasted_pork" "soy_sauce"    "ginger"      
 [8] "honey"        "garlic"       "cinnamon"     "soybean"      "sherry"       "corn"         "oyster"      

[[10]]
 [1] "cane_molasses" "tomato"        "pork"          "celery_oil"    "vinegar"       "soy_sauce"     "pepper"       
 [8] "ginger"        "garlic"        "corn"         
LS0tCnRpdGxlOiAiQW7DoWxpc2lzIGRlIGluZ3JlZGllbnRlcyBlbiByZWNldGFzIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgpBbsOhbGlzaXMgZGUKaHR0cHM6Ly93d3cubmF0dXJlLmNvbS9hcnRpY2xlcy9zcmVwMDAxOTYucGRmCgpQb2RlbW9zIHVzYXIgKnJlYWRfbGluZXNfY2h1bmtlZCogc2kgZWwgYXJjaGl2byBvcmlnaW5hbCBlcyBncmFuZGU6CgpgYGB7ciwgbWVzc2FnZSA9IEZBTFNFfQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGltcGlhciA8LSBmdW5jdGlvbihsaW5lYXMsLi4uKXsKICBzdHJfc3BsaXQobGluZWFzLCAnLCcpICU+JSAKICAgIGtlZXAofi54WzFdID09ICdFYXN0QXNpYW4nKSAlPiUKICAgIG1hcCh+LnhbLTFdKSAlPiUgIyBxdWl0YXIgdGlwbyBkZSBjb2NpbmEKICAgIG1hcCh+LnhbbmNoYXIoLngpID4gMF0pICMgcXVpdGFyIGVsZW1lbnRvcyB2YWN7aW9zfQp9CmZpbHRyYWRvIDwtIHJlYWRfbGluZXNfY2h1bmtlZCgnLi4vLi4vZGF0b3MvcmVjZXRhcy9zcmVwMDAxOTYtczMuY3N2JywKICAgICAgICAgICAgICAgICAgICBza2lwID0gMSwgY2FsbGJhY2sgPSBMaXN0Q2FsbGJhY2skbmV3KGxpbXBpYXIpKQpyZWNldGFzIDwtICBmaWx0cmFkbyAlPiUgZmxhdHRlbgpgYGAKCgpgYGB7ciwgbWVzc2FnZT1GQUxTRSwgd2FybmluZz1GQUxTRX0KbGlicmFyeShhcnVsZXMpCmxlbmd0aChyZWNldGFzKQojIyBObyBoYWNlciBtdWNobyBtw6FzIGNoaWNvIHF1ZSBlc3RlIHNvcG9ydGUsIHB1ZXMgdGVuZW1vcyByZWxhdGl2YW1lbnRlCiMjIHBvY2FzIHRyYW5zYWNjaW9uZXM6CnBhcnMgPC0gbGlzdChzdXBwb3J0ID0gMC4wNSwgIHRhcmdldCA9ICdmcmVxdWVudCBpdGVtc2V0cycsCiAgICAgICAgICAgICBleHQgPSBUUlVFKQphcF9yZWNldGFzIDwtIGFwcmlvcmkocmVjZXRhcywgcGFyYW1ldGVyID0gcGFycykKbGVuZ3RoKGFwX3JlY2V0YXMpCmBgYAoKVmVtb3MgbG9zIGl0ZW1zIGZyZWN1ZW50ZXMKCmBgYHtyfQpmcmVjcyA8LSBhcF9yZWNldGFzICU+JSBzdWJzZXQoc2l6ZSguKSA9PSAxICkgJT4lIHNvcnQoYnkgPSAnc3VwcG9ydCcpICU+JQogREFUQUZSQU1FCkRUOjpkYXRhdGFibGUoZnJlY3MgJT4lIG11dGF0ZV9pZihpcy5udW1lcmljLCBmdW5jdGlvbih4KSByb3VuZCh4LCAzKSkpCmBgYAoKWSBhaG9yYSBleGFtaW5hbW9zIGNvbWJpbmFjaW9uZXMgZnJlY3VlbnRlcyBkZSBkaXN0aW50b3MgdGFtYcOxb3MKCmBgYHtyfQphcF9yZWNldGFzICU+JSAKICBzdWJzZXQoc2l6ZSguKSA9PSAyKSAlPiUKICBzdWJzZXQoc3VwcG9ydCA+IDAuMjApICU+JQogIHNvcnQoYnkgPSAnc3VwcG9ydCcpICU+JQogIGluc3BlY3QKYGBgCgpJbmNsdXNvIGhheSBhbGd1bmFzIGNvbWJpbmFjaW9uZXMgZGUgNCBpbmdyZWRpZW50ZXMgcXVlIG9jdXJyZW4gY29uIGZyZWN1ZW5jaWEgYWx0YToKZXN0b3MgaW5ncmVkaWVudGVzIHNvbiBiYXNlcyBkZSBzYWxzYXMsIGNvbWJpbmFjaW9uZXMgZGUgY29uZGltZW50b3M6CgpgYGB7cn0KYXBfcmVjZXRhcyAlPiUgCiAgc3Vic2V0KHNpemUoLikgPT0gNCkgJT4lCiAgc3Vic2V0KHN1cHBvcnQgPiAwLjEwKSAlPiUKICBzb3J0KGJ5ID0gJ3N1cHBvcnQnKSAlPiUKICBpbnNwZWN0CmBgYAoKCmBgYHtyfQpwYXJzIDwtIGxpc3Qoc3VwcG9ydCA9IDAuMDEsIGNvbmZpZGVuY2UgPSAwLjEwLAogICAgICAgICAgICAgdGFyZ2V0ID0gJ3J1bGVzJywKICAgICAgICAgICAgIGV4dCA9IFRSVUUpCnJlZ2xhc19yZWNldGFzIDwtIGFwcmlvcmkocmVjZXRhcywgcGFyYW1ldGVyID0gcGFycykKYGBgCgpgYGB7cn0KYWdyZWdhcl9oeXBlcmxpZnQgPC0gZnVuY3Rpb24ocmVnbGFzLCB0cmFucyl7CiAgcXVhbGl0eShyZWdsYXMpIDwtIGNiaW5kKHF1YWxpdHkocmVnbGFzKSwgCgloeXBlcl9saWZ0ID0gaW50ZXJlc3RNZWFzdXJlKHJlZ2xhcywgbWVhc3VyZSA9ICJoeXBlckxpZnQiLCAKCXRyYW5zYWN0aW9ucyA9IHRyYW5zKSkKICByZWdsYXMKfQpyZWdsYXNfcmVjZXRhcyA8LSBhZ3JlZ2FyX2h5cGVybGlmdChyZWdsYXNfcmVjZXRhcywgcmVjZXRhcykKYGBgCgoKIyMgQW7DoWxpc2lzIGRlIHBhcmVzIGNvbXVuZXMKCmBgYHtyfQpsaWJyYXJ5KGFydWxlc1ZpeikKcmVnbGFzXzEgPC0gc3Vic2V0KHJlZ2xhc19yZWNldGFzLCBoeXBlcl9saWZ0ID4gMS4xICYgc3VwcG9ydCA+IDAuMSAmIGNvbmZpZGVuY2UgPiAwLjQwKQpsZW5ndGgocmVnbGFzXzEpCnJlZ2xhc190YW1fMiA8LSBzdWJzZXQocmVnbGFzXzEsIHNpemUocmVnbGFzXzEpPT0yKQojaW5zcGVjdChyZWdsYXNfdGFtXzIgJT4lIHNvcnQoYnkgPSAnaHlwZXJfbGlmdCcpKSAKcGxvdChyZWdsYXNfMSAlPiUgc3Vic2V0KHN1cHBvcnQgPiAwLjIpLCBlbmdpbmUgPSAicGxvdGx5IikKYGBgCgpgYGB7ciwgZmlnLndpZHRoPTEwLCBmaWcuaGVpZ2h0PTh9CmxpYnJhcnkodGlkeWdyYXBoKQpsaWJyYXJ5KGdncmFwaCkKZnJlY3MgPC0gCmRmX3JlZ2xhcyA8LSByZWdsYXNfdGFtXzIgJT4lIERBVEFGUkFNRSAlPiUgcmVuYW1lKGZyb209TEhTLCB0bz1SSFMpICU+JSBkYXRhLmZyYW1lCmRmX3JlZ2xhcyR3ZWlnaHQgPC0gbG9nKGRmX3JlZ2xhcyRsaWZ0KQpncmFwaF8xIDwtIGFzX3RibF9ncmFwaChkZl9yZWdsYXMpICU+JQogIG11dGF0ZShjZW50cmFsaXR5ID0gY2VudHJhbGl0eV9kZWdyZWUobW9kZSA9ICJhbGwiKSkgCnNldC5zZWVkKDg4MSkKZ2dyYXBoKGdyYXBoXzEsIGxheW91dCA9ICdmcicpICsKICBnZW9tX2VkZ2VfbGluayhhZXMoYWxwaGE9bGlmdCksIAogICAgICAgICAgICAgICAgIGNvbG91ciA9ICdyZWQnLAogICAgICAgICAgICAgICAgIGFycm93ID0gYXJyb3cobGVuZ3RoID0gdW5pdCg0LCAnbW0nKSkpICsgCiAgZ2VvbV9ub2RlX3BvaW50KGFlcyhzaXplID0gY2VudHJhbGl0eSwgY29sb3VyID0gY2VudHJhbGl0eSkpICsgCiAgZ2VvbV9ub2RlX3RleHQoYWVzKGxhYmVsID0gbmFtZSksIHNpemU9NCwKICAgICAgICAgICAgICAgICBjb2xvdXIgPSAnZ3JheTIwJywgcmVwZWw9VFJVRSkgKwogIHRoZW1lX2dyYXBoKCkKYGBgCgoKYGBge3J9CnJlZ2xhc18xIDwtIHN1YnNldChyZWdsYXNfcmVjZXRhcywgaHlwZXJfbGlmdCA+IDEuNSAmIGNvbmZpZGVuY2UgPiAwLjEpCmxlbmd0aChyZWdsYXNfMSkKcmVnbGFzX3RhbV8yIDwtIHN1YnNldChyZWdsYXNfMSwgc2l6ZShyZWdsYXNfMSk9PTIpCmxlbmd0aChyZWdsYXNfdGFtXzIpCmBgYAoKYGBge3IsIGZpZy53aWR0aD0xMCwgZmlnLmhlaWdodD04fQpsaWJyYXJ5KHRpZHlncmFwaCkKbGlicmFyeShnZ3JhcGgpCmRmX3JlZ2xhcyA8LSByZWdsYXNfdGFtXzIgJT4lIERBVEFGUkFNRSAlPiUgcmVuYW1lKGZyb209TEhTLCB0bz1SSFMpICU+JSBhc19kYXRhX2ZyYW1lCmRmX3JlZ2xhcyR3ZWlnaHQgPC0gbG9nKGRmX3JlZ2xhcyRoeXBlcl9saWZ0KQpncmFwaF8xIDwtIGFzX3RibF9ncmFwaChkZl9yZWdsYXMpICU+JQogIG11dGF0ZShjZW50cmFsaXR5ID0gY2VudHJhbGl0eV9kZWdyZWUobW9kZSA9ICJhbGwiKSkgCgpnZ3JhcGgoZ3JhcGhfMSwgbGF5b3V0ID0gJ2ZyJywgc3RhcnQudGVtcD0xMDApICsKICBnZW9tX2VkZ2VfbGluayhhZXMoYWxwaGE9bGlmdCksIAogICAgICAgICAgICAgICAgIGNvbG91ciA9ICdyZWQnLAogICAgICAgICAgICAgICAgIGFycm93ID0gYXJyb3cobGVuZ3RoID0gdW5pdCg0LCAnbW0nKSkpICsgCiAgZ2VvbV9ub2RlX3BvaW50KGFlcyhzaXplID0gY2VudHJhbGl0eSwgY29sb3VyID0gY2VudHJhbGl0eSkpICsgCiAgZ2VvbV9ub2RlX3RleHQoYWVzKGxhYmVsID0gbmFtZSksIHNpemU9NCwKICAgICAgICAgICAgICAgICBjb2xvdXIgPSAnZ3JheTIwJywgcmVwZWw9VFJVRSkgKwogIHRoZW1lX2dyYXBoKCkKYGBgCgpFeHBvcnRhbW9zIHBhcmEgZXhhbWluYXIgZW4gR2VwaGk6CgoKYGBge3J9CndyaXRlX2NzdihkZl9yZWdsYXMgJT4lIHJlbmFtZShzb3VyY2U9ZnJvbSwgdGFyZ2V0PXRvKSAlPiUKICAgICAgICAgICAgc2VsZWN0KC1jb3VudCksIAogICAgICAgICAgcGF0aD0ncmVnbGFzLmNzdicpCmBgYAoKCiMjIyBOb3RhCgpMYSBjb21iaW5hY2nDs24gX2Nvcm5fIHkgX3N0YXJjaF8gcHVlZGUgZGViZXJzZSBlbiBwYXJ0ZSBhIHVuYSBzZXBhcmFjacOzbiBpbmNvcnJlY3RhIGVuIGVsIApwcm9jZXNhbWllbnRvIGRlIGxvcyBkYXRvcyAoY29ybiBzdGFyY2ggbyBtYWl6ZW5hIGNvbnZlcnRpZG8gZW4gZG9zIGluZ3JlZGllbnRlcywgY29ybiB5IHN0YXJjaCk6CgpgYGB7cn0KZGZfcmVnbGFzICU+JSBmaWx0ZXIoZnJvbSA9PSAie2Nvcm59IiwgdG8gPT0gIntzdGFyY2h9IikKYGBgCgpMYSBjb25maWFuemEgZXMgY29uc2lkZXJhYmxlbWVudGUgYWx0YSwgYXVucXVlIHRlbmVtb3MgcG9jb3MgZGF0b3MgZGUgZXN0YSBjb21iaW5hY2nDs24uIFBvZGVtb3MgZXhhbWluYXIgYWxndW5vcyBlamVtcGxvczoKCmBgYHtyfQpyZWNldGFzICU+JSBrZWVwKH4gInRvbWF0byIgJWluJSAueCAmICJjb3JuIiAlaW4lIC54KSAlPiUgaGVhZCgxMCkKYGBgCgoK